In [1]:
# Auto-reload edited modules (e.g. under src/) without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [2]:
import autograd.numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings

import plotly.offline as py
py.init_notebook_mode(connected=False)
import plotly.graph_objs as go
In [3]:
from src.models import Model, Prior, Conditional_model
from src.sampling import Metropolis_Hastings as MH
from src.helpers import samples_exploration
from src.optimization import gradient_descent as GD

Load the data

In [4]:
# Load the whitespace-delimited wage dataset.
# r"\s+" (raw string) avoids the invalid-escape-sequence warning that the
# bare "\s+" literal raises on modern Python.
df = pd.read_csv("data/wages.dat",
                 sep=r"\s+")

# Standardize the continuous variables to zero mean / unit variance.
for col in ["LNWAGE", "ED", "EX"]:
    df[col] = (df[col] - np.mean(df[col].values)) / np.std(df[col].values)

# Drop correlated variables (EXSQ, AGE) — presumably redundant with ED/EX;
# `columns=` already implies axis=1, so no separate axis argument is needed.
df = df.drop(columns=["EXSQ", "AGE"])

# Build the predictor matrix and the response vector.
predictors = df[['ED', 'SOUTH', 'NONWH', 'HISP', 'FE', 'MARR', 'MARRFE', 'EX',
                 'UNION', 'MANUF', 'CONSTR', 'MANAG', 'SALES', 'CLER',
                 'SERV', 'PROF']]
y = df.LNWAGE.values
X = predictors.values

# Hold out 20% as a test set; fixed random_state makes the split reproducible.
X, X_test, Y, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Create the models

In [5]:
# Build the two Bayesian regression models on the training split.
# Prior hyper-parameters: [0, 3, 2] for the Gaussian prior, with an extra
# 1/4 entry for the Student prior — see src.models.Prior for their meaning.
model_gaussian = Model.Model(
    Prior.Gaussian_prior,
    Conditional_model.Gaussian,
    Prior=[0, 3, 2],
    data=X,
    response=Y,
)
model_student = Model.Model(
    Prior.Student_prior,
    Conditional_model.Gaussian,
    Prior=[0, 3, 2, 1/4],
    data=X,
    response=Y,
)

Obtaining estimates

Using MAP (gradient descent)

In [6]:
# MAP estimates: run vanilla gradient descent for 1000 iterations on each model.
theta_gd_student = GD.vanilla_gd(model_student, max_iter= 1000)
theta_gd_gaussian = GD.vanilla_gd(model_gaussian, max_iter= 1000)
Progress : [====================] 100% Done...
Progress : [====================] 100% Done...
 

Using Metropolis Hastings

In [7]:
# Draw 20k posterior samples per model with random-walk Metropolis-Hastings.
# record=True captures (and thus silences) warnings emitted during sampling.
with warnings.catch_warnings(record=True):
    # Random start vector: one slot per beta coefficient plus one extra
    # leading slot (set explicitly below).
    initial = np.random.randn(model_gaussian.beta_size+1)
    initial[0]=0.5
    samples_gaussian = MH.random_walk_MH(model_gaussian, max_iter = 20000, verbose = True, step_size = 0.035, initial = initial)
    # NOTE(review): the same array is mutated in place, so the student chain
    # starts from the gaussian chain's start vector with initial[0] = 20 —
    # confirm the 0.5 vs 20 asymmetry is intentional. Also no RNG seed is set,
    # so re-runs produce different chains.
    initial[0]=20
    samples_student = MH.random_walk_MH(model_student, max_iter = 20000, verbose = True, step_size = 0.035, initial = initial)
Metropolis Hasting started at: 2019-05-30 15:12:14
Progress : [====================] 100% Done...
  Acceptance rate : 27.7%  (advised values between 10% and 50%)
  duration: 0:00:10
Metropolis Hasting started at: 2019-05-30 15:12:24
Progress : [====================] 100% Done...
  Acceptance rate : 32.2%  (advised values between 10% and 50%)
  duration: 0:00:10
In [8]:
# Diagnostics for the gaussian chain (trace plots / distribution estimates),
# discarding the first 2500 draws as burn-in; warnings captured to keep output clean.
with warnings.catch_warnings(record=True):    
    print("---------------------------------  gaussian samples-----------------------------------")
    samples_exploration(samples_gaussian[2500:], correlation= False)
---------------------------------  gaussian samples-----------------------------------
iterations
estimation of the distributions
In [9]:
# Same diagnostics for the student chain, with the same 2500-sample burn-in.
with warnings.catch_warnings(record=True):    
    print("---------------------------------  student samples-----------------------------------")
    samples_exploration(samples_student[2500:], correlation= False)
---------------------------------  student samples-----------------------------------
iterations
estimation of the distributions

Comparison of the results

We use a burn-in period of 2500 iterations.

In [10]:
# Posterior-mean parameter estimates, averaging each chain after discarding
# the first 2500 draws as burn-in.
theta_MH_gaussian = np.mean(samples_gaussian[2500:], axis=0)
theta_MH_student = np.mean(samples_student[2500:], axis=0)
In [11]:
# Frequentist OLS baseline for comparison (no intercept column is added;
# the continuous variables were standardized earlier).
# lstsq is numerically safer than explicitly forming and inverting X.T @ X.
beta_OLS, _, _, _ = np.linalg.lstsq(X, Y, rcond=None)
# Unbiased residual-variance estimate: RSS / (n - p).
residuals = Y - X @ beta_OLS
sigma_OLS = residuals @ residuals / (len(Y) - len(beta_OLS))
# Pack as [sigma_OLS, beta_0, ..., beta_p].
theta_OLS = np.insert(beta_OLS, 0, sigma_OLS)
In [15]:
def _coef_trace(values, label, color, outlined=False):
    """Scatter trace of a parameter vector; black-outlined markers flag MH estimates."""
    marker = dict(size=10, color=color)
    if outlined:
        marker["line"] = dict(width=2, color="black")
    return go.Scatter(y=values, name=label, mode="markers", marker=marker)


trace0 = _coef_trace(theta_gd_student, 'student, gd', 'steelblue')
trace1 = _coef_trace(theta_MH_student, 'student, MH', 'steelblue', outlined=True)
trace2 = _coef_trace(theta_gd_gaussian, 'gaussian, gd', 'red')
trace3 = _coef_trace(theta_MH_gaussian, 'gaussian, MH', 'red', outlined=True)
trace4 = _coef_trace(theta_OLS, 'OLS estimate', 'orange')

data = [trace0, trace1, trace2, trace3, trace4]

layout = dict(title = 'Result comparison',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
py.iplot(fig)
In [13]:
# Color-code the training responses into four bands:
# y >= 1 -> 1, [0, 1) -> 2, [-1, 0) -> 3, y < -1 -> 4.
inter = Y
classes = inter.copy()
# Later assignments overwrite earlier ones, so each point ends up in its
# narrowest matching band (e.g. y = -2 matches all three "<" masks -> band 4).
for mask, band in ((Y >= 1, 1), (Y < 1, 2), (Y < 0, 3), (Y < -1, 4)):
    classes[mask] = band

flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]

def convert(n):
    """Map a band index to its hex color."""
    return flatui[int(n)]

colors = [convert(c) for c in classes]
plt.scatter(np.arange(X.shape[0]), Y, c=colors)
plt.show()
In [14]:
# Same band-colored scatter for the held-out test responses:
# y >= 1 -> 1, [0, 1) -> 2, [-1, 0) -> 3, y < -1 -> 4.
inter = y_test
classes = inter.copy()
# Sequential overwrites leave each point in its narrowest matching band.
for mask, band in ((y_test >= 1, 1), (y_test < 1, 2), (y_test < 0, 3), (y_test < -1, 4)):
    classes[mask] = band

flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]

def convert(n):
    """Map a band index to its hex color."""
    return flatui[int(n)]

colors = [convert(c) for c in classes]
plt.scatter(np.arange(X_test.shape[0]), y_test, c=colors)
plt.show()